Load the necessary libraries
library(car) #for regression diagnostics
library(broom) #for tidy output
library(ggfortify) #for model diagnostics
library(sjPlot) #for outputs
library(knitr) #for kable
library(effects) #for partial effects plots
library(emmeans) #for estimating marginal means
library(ggeffects) #for partial effects plots
library(MASS) #for glm.nb
library(MuMIn) #for AICc
library(DHARMa) #for residual diagnostics plots
library(performance) #for residuals diagnostics
library(see) #for plotting residuals
library(modelr) #for auxiliary modelling functions
library(tidyverse) #for data wrangling
Here is a modified example from Peake and Quinn (1993). Peake and Quinn (1993) investigated the relationship between the number of individuals of invertebrates living in amongst clumps of mussels on a rocky intertidal shore and the area of those mussel clumps.
Format of peakquinn.csv data files
| AREA | INDIV |
|---|---|
| 516.00 | 18 |
| 469.06 | 60 |
| 462.25 | 57 |
| 938.60 | 100 |
| 1357.15 | 48 |
| ... | ... |
| AREA | Area of mussel clump mm2 - Predictor variable |
| INDIV | Number of individuals found within clump - Response variable |
The aim of the analysis is to investigate the relationship between mussel clump area and the number of non-mussel invertebrate individuals supported in the mussel clump.
## Import the mussel-clump data (AREA = clump area in mm2, INDIV = invertebrate count).
## Use `<-` for assignment rather than `=` (tidyverse style).
peake <- read_csv('../public/data/peakquinn.csv', trim_ws = TRUE)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## AREA = col_double(),
## INDIV = col_double()
## )
## Quick structural overview of the data: 25 rows, two numeric columns (AREA, INDIV)
glimpse(peake)
## Rows: 25
## Columns: 2
## $ AREA <dbl> 516.00, 469.06, 462.25, 938.60, 1357.15, 1773.66, 1686.01, 1786…
## $ INDIV <dbl> 18, 60, 57, 100, 48, 118, 148, 214, 225, 283, 380, 278, 338, 27…
## Exploratory scatterplot of invertebrate counts against clump area,
## with a default smoother to visualise the trend
ggplot(data = peake, mapping = aes(x = AREA, y = INDIV)) +
  geom_point() +
  geom_smooth()
Conclusions:
## Univariate boxplots to assess the distributions of the response and predictor
peake %>% ggplot(aes(y = INDIV)) + geom_boxplot()
peake %>% ggplot(aes(y = AREA)) + geom_boxplot()
Conclusions:
Both the response (INDIV) and predictor (AREA) appear to be skewed. It is not surprising that the response is skewed as it represents counts of the number of individuals. We might expect that this should follow a Poisson distribution. Poisson distributions must be bounded by 0 at the lower end and hence as the expected value (=mean and location) of a Poisson distribution approaches 0, the distribution will become more and more asymmetrical. At the same time, the distribution would be expected to become narrower (have a smaller variance) - since in a Poisson distribution the mean and variance are equal. Along with modelling these data against a Poisson distribution (with a natural log link), we should probably attempt to normalise the predictor variable (AREA). Whilst linear models don't assume that the predictor variable follows a normal distribution (it is typically assumed to be a uniform distribution), it is assumed to be symmetrical. In this case, a natural log transformation might help achieve this. At the same time, it might also help homogeneity of variance and linearity.
We can mimic the action of using a log link and log-transformed predictor by presenting the scatterplot on log-transformed axes.
## Re-draw the scatterplot on log10 axes to mimic the effect of a log link
## and a log-transformed predictor
ggplot(data = peake, mapping = aes(x = AREA, y = INDIV)) +
  geom_point() +
  geom_smooth() +
  scale_x_log10() +
  scale_y_log10()
Conclusions:
\[ y_i \sim{} \mathcal{N}(\mu_i, \sigma^2)\\ \mu_i = \beta_0 + \beta_1 ln(x_i) \]
## Naive Gaussian linear model of counts against raw area, followed by the
## six standard diagnostic plots (residuals, Q-Q, scale-location, Cook's D, leverage)
peake.lm <- lm(INDIV ~ AREA, data = peake)
autoplot(peake.lm, which = 1:6)
Conclusions:
## Tabulate influence statistics for each observation
## (DFBETAS, DFFITS, covariance ratio, Cook's D and hat values)
influence.measures(peake.lm)
## Influence measures of
## lm(formula = INDIV ~ AREA, data = peake) :
##
## dfb.1_ dfb.AREA dffit cov.r cook.d hat inf
## 1 -0.1999 0.13380 -0.20006 1.125 2.04e-02 0.0724
## 2 -0.1472 0.09887 -0.14731 1.150 1.12e-02 0.0728
## 3 -0.1507 0.10121 -0.15073 1.148 1.17e-02 0.0728
## 4 -0.1153 0.07466 -0.11549 1.155 6.92e-03 0.0687
## 5 -0.1881 0.11765 -0.18896 1.117 1.82e-02 0.0653
## 6 -0.1214 0.07306 -0.12237 1.142 7.75e-03 0.0622
## 7 -0.0858 0.05206 -0.08639 1.155 3.88e-03 0.0628
## 8 -0.0176 0.01059 -0.01776 1.165 1.65e-04 0.0621
## 9 -0.0509 0.02633 -0.05236 1.150 1.43e-03 0.0535
## 10 -0.0237 0.01069 -0.02504 1.148 3.28e-04 0.0489
## 11 0.0475 -0.01961 0.05096 1.141 1.35e-03 0.0470
## 12 -0.0418 0.01717 -0.04492 1.142 1.05e-03 0.0468
## 13 -0.0061 0.00221 -0.00672 1.144 2.36e-05 0.0448
## 14 -0.0453 0.01859 -0.04862 1.142 1.23e-03 0.0468
## 15 0.1641 -0.05100 0.18584 1.067 1.74e-02 0.0433
## 16 0.0472 -0.00254 0.06317 1.129 2.08e-03 0.0401
## 17 0.1764 -0.01859 0.22781 1.021 2.57e-02 0.0403
## 18 0.0542 0.01493 0.09093 1.120 4.28e-03 0.0411
## 19 0.2173 0.12011 0.43430 0.808 8.29e-02 0.0433
## 20 -0.0701 -0.02168 -0.12016 1.106 7.43e-03 0.0413
## 21 0.1849 0.63635 1.07759 0.357 3.36e-01 0.0614 *
## 22 0.0752 -0.33126 -0.39474 1.157 7.79e-02 0.1352
## 23 -0.0789 0.22611 0.25071 1.362 3.25e-02 0.2144 *
## 24 0.0853 -0.21744 -0.23573 1.473 2.88e-02 0.2681 *
## 25 0.4958 -1.32133 -1.44477 0.865 8.44e-01 0.2445 *
## Composite panel of residual diagnostics from the performance package
performance::check_model(peake.lm)
## Not enough model terms in the conditional part of the model to check for multicollinearity.
Conclusions:
## DHARMa simulated (quantile) residuals for the Gaussian model, plus formal
## tests of uniformity, dispersion and outliers
peake.resid <- simulateResiduals(peake.lm, plot=TRUE)
testResiduals(peake.resid)
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.18, p-value = 0.3927
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 0.99227, p-value = 0.952
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 1, observations = 25, p-value = 0.52
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.000 0.061
## sample estimates:
## outlier frequency (expected: 0.0116 )
## 0.04
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.18, p-value = 0.3927
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 0.99227, p-value = 0.952
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 1, observations = 25, p-value = 0.52
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.000 0.061
## sample estimates:
## outlier frequency (expected: 0.0116 )
## 0.04
Conclusions:
Please go to the Poisson (GLM) tab.
\[ y_i \sim{} \mathcal{Pois}(\lambda_i)\\ ln(\lambda_i) = \beta_0 + \beta_1 ln(x_i) \]
## Poisson GLM (log link) with a log-transformed predictor, followed by the
## six standard diagnostic plots
peake.glm <- glm(INDIV ~ log(AREA), family = poisson(link = 'log'), data = peake)
autoplot(peake.glm, which = 1:6)
Conclusions:
## Influence statistics for the Poisson model
## (DFBETAS, DFFITS, covariance ratio, Cook's D and hat values)
influence.measures(peake.glm)
## Influence measures of
## glm(formula = INDIV ~ log(AREA), family = poisson(link = "log"), data = peake) :
##
## dfb.1_ dfb.l.AR dffit cov.r cook.d hat inf
## 1 -0.27480 0.26720 -0.2810 1.073 1.8302 0.0710 *
## 2 -0.04611 0.04488 -0.0471 1.173 0.0736 0.0705
## 3 -0.05602 0.05453 -0.0572 1.171 0.1071 0.0704
## 4 -0.04819 0.04647 -0.0502 1.174 0.0845 0.0720
## 5 -0.31731 0.30365 -0.3375 1.028 2.7532 0.0698 *
## 6 -0.14873 0.14126 -0.1620 1.133 0.7980 0.0668 *
## 7 -0.05948 0.05659 -0.0645 1.166 0.1386 0.0675
## 8 0.07488 -0.07110 0.0816 1.161 0.2477 0.0667
## 9 -0.06040 0.05587 -0.0727 1.151 0.1760 0.0575
## 10 -0.03779 0.03419 -0.0500 1.149 0.0848 0.0527
## 11 0.04660 -0.04160 0.0653 1.143 0.1555 0.0508
## 12 -0.06833 0.06095 -0.0961 1.133 0.3023 0.0507
## 13 -0.02679 0.02345 -0.0408 1.146 0.0569 0.0489
## 14 -0.07303 0.06515 -0.1027 1.131 0.3433 0.0507
## 15 0.14202 -0.12170 0.2359 1.040 2.1279 0.0477 *
## 16 0.01114 -0.00816 0.0302 1.145 0.0327 0.0467
## 17 0.10458 -0.07980 0.2557 1.018 2.4750 0.0465 *
## 18 0.00878 -0.00353 0.0508 1.145 0.0930 0.0501
## 19 0.03006 0.01642 0.4473 0.857 7.2424 0.0536 *
## 20 -0.04114 0.01412 -0.2614 1.028 1.9833 0.0505 *
## 21 -0.21773 0.31062 0.9321 0.530 25.9624 0.0742 *
## 22 0.27322 -0.31834 -0.5255 1.087 8.0430 0.1366 *
## 23 -0.06181 0.06967 0.1003 1.347 0.3584 0.1916 *
## 24 0.16956 -0.18902 -0.2594 1.382 2.2615 0.2255 *
## 25 0.84423 -0.94513 -1.3210 0.824 39.5373 0.2109 *
## Composite residual diagnostics for the Poisson model
performance::check_model(peake.glm)
## Not enough model terms in the conditional part of the model to check for multicollinearity.
## Dispersion ratio of ~70 (far above 1) indicates substantial overdispersion
performance::check_overdispersion(peake.glm)
## # Overdispersion test
##
##  dispersion ratio =   70.410
##   Pearson's Chi-Squared = 1619.441
##                 p-value =  < 0.001
## Zero-inflation check (not applicable here: no zeros in the response)
performance::check_zeroinflation(peake.glm)
## Model has no observed zeros in the response variable.
## NULL
## Normality of residuals (note: of limited relevance for a Poisson GLM)
performance::check_normality(peake.glm) %>% plot
## OK: residuals appear as normally distributed (p = 0.103).
## Outlier detection based on influence measures
performance::check_outliers(peake.glm)
## Warning: 11 outliers detected (cases 1, 5, 6, 15, 17, 19, 20, 21, 22, 24, 25).
Conclusions:
## DHARMa simulated (quantile) residuals for the Poisson model, plus formal
## tests of uniformity, dispersion and outliers
peake.resid <- simulateResiduals(peake.glm, plot=TRUE)
testResiduals(peake.resid)
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.54099, p-value = 8.826e-07
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 10.446, p-value < 2.2e-16
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 13, observations = 25, p-value < 2.2e-16
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.00 0.08
## sample estimates:
## outlier frequency (expected: 0.012 )
## 0.52
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.54099, p-value = 8.826e-07
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 10.446, p-value < 2.2e-16
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 13, observations = 25, p-value < 2.2e-16
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.00 0.08
## sample estimates:
## outlier frequency (expected: 0.012 )
## 0.52
Conclusions:
## Check the Poisson model for lack of fit via:
## Pearson chi-square goodness-of-fit test
peake.ss <- sum(resid(peake.glm, type = "pearson")^2)
## NOTE: use the full component name ($df.residual) rather than relying on
## fragile partial matching ($df.resid)
1 - pchisq(peake.ss, peake.glm$df.residual)
## [1] 0
## p < 0.05: evidence of a lack of fit
## Deviance goodness-of-fit test
1 - pchisq(peake.glm$deviance, peake.glm$df.residual)
## [1] 0
## p < 0.05: evidence of a lack of fit
## Dispersion estimates (should be ~1 for a well-fitting Poisson model)
peake.ss / peake.glm$df.residual
## [1] 70.41047
peake.glm$deviance / peake.glm$df.residual
## [1] 67.29376
The model appears to be overdispersed. That is, the Poisson distribution would expect (assume) that the variance is equal to the mean. The diagnostics suggest that there is more variability in the response than the Poisson model would expect.
This could be due to:
the model being too simple. Linear models are intentionally low-dimensional representations of the system. They do not, and can not represent all the complexity in the underlying system. Instead they are intended to provide very targeted insights into a small number of potential influences. Nevertheless, the model might still be too simplistic. Perhaps if we were able to add an additional covariate, we might be able to explain the additional variation. For example, in this example, although it might be reasonable to expect that the number of individuals in a mussel clump should be driven by a Poisson process, the mussel clump area alone might not be sufficient to capture the variance reasonably. Perhaps if we were able to include additional information, such as the position of the clump along the shore (tidal influence) or orientation on the rock etc, we might be able to explain more of the currently unexplained variation.
in the absence of additional covariates, it is possible to add a special type of dummy covariate that is like a proxy for all additional covariates that we could add. This dummy variable is called a unit-level (or observation-level) random effect. It essentially soaks up the additional variance.
another source of additional variation is when the objects being counted have a tendency to aggregate (clumped). When this is the case, the sampled data tends to be more varied since any single sample is likely to either less or more than the overall average number of items. Hence the average of the samples might turn out to be similar to a more homogenous population, yet it will have higher variance.
The Negative Binomial distribution is able to accommodate clumpy data. Rather than assume that the variance and mean are equal (dispersion of 1), it estimates dispersion as an additional parameter. Together the two parameters of the Negative Binomial can be used to estimate the location (mean) and spread (variance) of the distribution.
yet another cause of overdispersion is the presence of excessive zeros. In particular, some of these zeros could be false zeros. That is, a zero was recorded (no individuals observed), yet one or more individuals were actually present (just not detected). This could imply that the observed data are generated by two processes. One that determines the actual number of items that exist and then another that determines the detectability of the items. For these circumstances, we can fit zero-inflated models. Zero-inflated models are a mixture of two models, one representing the count process and the other representing the imperfect detection process.
Please go to the Negative Binomial (GLM) tab.
\[ y_i \sim{} \mathcal{NegBin}(\lambda_i, \theta)\\ ln(\lambda_i) = \beta_0 + \beta_1 ln(x_i) \]
## Negative Binomial GLM (log link, theta estimated) to accommodate
## the overdispersion detected in the Poisson fit
peake.glm1 <- glm.nb(INDIV ~ log(AREA), data = peake)
## Also fit a model with a centred predictor to illustrate the impact of
## centring on the estimated coefficients
peake.glm2 <- glm.nb(INDIV ~ scale(log(AREA), scale = FALSE), data = peake)
autoplot(peake.glm1, which = 1:6)
Conclusions:
## Influence statistics for the Negative Binomial model
## (DFBETAS, DFFITS, covariance ratio, Cook's D and hat values)
influence.measures(peake.glm1)
## Influence measures of
## glm.nb(formula = INDIV ~ log(AREA), data = peake, init.theta = 7.369263003, link = log) :
##
## dfb.1_ dfb.l.AR dffit cov.r cook.d hat inf
## 1 -1.054875 0.987692 -1.1293 0.748 0.313001 0.1554 *
## 2 0.206875 -0.194302 0.2200 1.279 0.031769 0.1645 *
## 3 0.162776 -0.152952 0.1729 1.293 0.019220 0.1659 *
## 4 0.096225 -0.087632 0.1105 1.207 0.007770 0.1033
## 5 -0.505155 0.446373 -0.6335 0.801 0.113572 0.0777
## 6 -0.105801 0.090193 -0.1480 1.133 0.010967 0.0629
## 7 0.016821 -0.014457 0.0230 1.169 0.000318 0.0655
## 8 0.179367 -0.152721 0.2519 1.071 0.045529 0.0626
## 9 -0.010516 0.007231 -0.0249 1.142 0.000356 0.0440
## 10 -0.003083 0.001214 -0.0134 1.139 0.000105 0.0408
## 11 0.013908 -0.000181 0.0976 1.116 0.006282 0.0406
## 12 -0.009512 -0.000223 -0.0692 1.128 0.002572 0.0406
## 13 -0.000763 -0.001702 -0.0175 1.139 0.000177 0.0411
## 14 -0.010506 -0.000234 -0.0764 1.125 0.003099 0.0406
## 15 -0.008622 0.042042 0.2384 1.018 0.041811 0.0420
## 16 -0.006407 0.009578 0.0239 1.148 0.000345 0.0488
## 17 -0.054463 0.085385 0.2303 1.044 0.038438 0.0474
## 18 -0.009648 0.012719 0.0245 1.157 0.000363 0.0561
## 19 -0.145174 0.184432 0.3238 1.009 0.078107 0.0608
## 20 0.114479 -0.149993 -0.2847 1.029 0.033283 0.0568
## 21 -0.282767 0.335956 0.4884 0.932 0.183075 0.0781
## 22 0.304509 -0.346099 -0.4398 1.066 0.077348 0.1084
## 23 0.089374 -0.100144 -0.1219 1.240 0.008049 0.1270
## 24 0.221731 -0.247052 -0.2958 1.205 0.041771 0.1367
## 25 0.579077 -0.646655 -0.7793 0.904 0.187567 0.1326
## Composite residual diagnostics for the Negative Binomial model
performance::check_model(peake.glm1)
## Not enough model terms in the conditional part of the model to check for multicollinearity.
Conclusions:
## DHARMa simulated (quantile) residuals for the Negative Binomial model,
## plus formal tests of uniformity, dispersion and outliers
peake.resid <- simulateResiduals(peake.glm1, plot=TRUE)
testResiduals(peake.resid)
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.11352, p-value = 0.8684
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 1.1355, p-value = 0.496
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 0, observations = 25, p-value = 1
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.00 0.08
## sample estimates:
## outlier frequency (expected: 0.0136 )
## 0
## $uniformity
##
## One-sample Kolmogorov-Smirnov test
##
## data: simulationOutput$scaledResiduals
## D = 0.11352, p-value = 0.8684
## alternative hypothesis: two-sided
##
##
## $dispersion
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## ratioObsSim = 1.1355, p-value = 0.496
## alternative hypothesis: two.sided
##
##
## $outliers
##
## DHARMa bootstrapped outlier test
##
## data: simulationOutput
## outliers at both margin(s) = 0, observations = 25, p-value = 1
## alternative hypothesis: two.sided
## percent confidence interval:
## 0.00 0.08
## sample estimates:
## outlier frequency (expected: 0.0136 )
## 0
Conclusions:
## Check the Negative Binomial model for lack of fit via:
## Pearson chi-square goodness-of-fit test
peake.ss <- sum(resid(peake.glm1, type = "pearson")^2)
## Use the full component name ($df.residual) rather than relying on
## fragile partial matching ($df.resid)
1 - pchisq(peake.ss, peake.glm1$df.residual)
## [1] 0.4897527
## p > 0.05: no evidence of a lack of fit
## Deviance goodness-of-fit test
1 - pchisq(peake.glm1$deviance, peake.glm1$df.residual)
## [1] 0.3036601
## p > 0.05: no evidence of a lack of fit
Conclusions:
## Estimated dispersion (Pearson chi-square / residual df); a value close to 1
## indicates the Negative Binomial variance assumption is adequate.
## Use $df.residual instead of partial matching ($df.resid).
peake.ss / peake.glm1$df.residual
## [1] 0.9786343
Conclusions:
The model diagnostics suggest that the Negative Binomial model is more appropriate - since it satisfied the assumptions whereas the Poisson did not. In this case, this should be the overriding factor in selecting between the two models. If however, both were found to be valid, we could use information criteria to help us choose between the two.
Information criteria provide a relative measure of the fit of the model penalizing for complexity (i.e. a balance between goodness of fit and model simplicity). The actual value of an AIC by itself is of no real use. It is only useful as a comparative measure between two or more related models. For this purpose, the lower AIC is considered better.
One of the most widely used information criterion is the Akaike Information Criterion (AIC). The AIC is a measure of in-sample prediction error that penalises complexity:
\[ AIC = 2k - 2ln(\hat{L}) \] where \(k\) is the number of parameters being estimated and \(\hat{L}\) is the maximum likelihood of the fitted model.
As a general rule, if two AIC’s are within 2 units of each other, they are considered to be not significantly different from one another. In that case, you would select the model that is simplest (lower used degrees of freedom).
## Compare the Poisson and Negative Binomial fits via information criteria
AIC(peake.glm, peake.glm1)
## For small sample sizes, it is better to use AICc - this is
## corrected for small sample sizes.
AICc(peake.glm, peake.glm1)
Conclusions:
Unfortunately, these seem to be broken at the moment…
## Partial effects plots via several equivalent interfaces.
## The following is equivalent to ggeffect
plot_model(peake.glm1, type='eff', show.data=FALSE, terms='AREA')
#plot_model(peake.glm1, type='eff', show.data=FALSE, terms='AREA [exp]')
## The following is equivalent to ggpredict
#plot_model(peake.glm1, type='pred', show_data=TRUE, terms='AREA [exp]')
## The following is equivalent to ggemmeans
#plot_model(peake.glm1, type='emm', terms='AREA [exp]')
## Effects-package partial residual plot on the response (count) scale
plot(allEffects(peake.glm1, residuals=TRUE), type='response')
## ggeffects marginal predictions with the raw data overlaid
ggpredict(peake.glm1) %>% plot(add.data=TRUE)
## $AREA
ggemmeans(peake.glm1, ~AREA) %>% plot(add.data=TRUE)
## Parameter estimates (link scale), theta (dispersion) and deviance summary
## for the Negative Binomial model
summary(peake.glm1)
##
## Call:
## glm.nb(formula = INDIV ~ log(AREA), data = peake, init.theta = 7.369263003,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.29133 -0.59599 -0.06927 0.48929 1.64734
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.16452 0.53387 -2.181 0.0292 *
## log(AREA) 0.82469 0.06295 13.102 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(7.3693) family taken to be 1)
##
## Null deviance: 161.076 on 24 degrees of freedom
## Residual deviance: 25.941 on 23 degrees of freedom
## AIC: 312.16
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 7.37
## Std. Err.: 2.16
##
## 2 x log-likelihood: -306.16
Conclusions:
glm.nb(INDIV~scale(log(AREA),scale=FALSE), data=peake)), then the y-intercept would have had a sensible interpretation. It would have been the expected number of individuals at the average clump area. Nevertheless, either way, the hypothesis test associated with the intercept is rarely of any value.
confint(peake.glm1)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -2.257255 -0.04087877
## log(AREA) 0.693092 0.95477695
## or on the response scale
## (exponentiating log-link intervals gives multiplicative effects:
## the slope CI is the factor change in expected counts per unit log(AREA))
exp(confint(peake.glm1))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1046373 0.9599455
## log(AREA) 1.9998896 2.5980910
Conclusions:
## Tidy coefficient table (link scale) with profile confidence intervals
tidy(peake.glm1, conf.int=TRUE)
## Exponentiated (response-scale) coefficients: incidence rate ratios
tidy(peake.glm1, conf.int=TRUE, exponentiate=TRUE)
## One-row model-level summary (deviance, AIC, df, etc.)
glance(peake.glm1)
# warning this is only appropriate for html output
sjPlot::tab_model(peake.glm1,show.se=TRUE,show.aic=TRUE)
| INDIV | ||||
|---|---|---|---|---|
| Predictors | Incidence Rate Ratios | std. Error | CI | p |
| (Intercept) | 0.31 | 0.53 | 0.10 – 0.96 | 0.029 |
| AREA [log] | 2.28 | 0.06 | 2.00 – 2.60 | <0.001 |
| Observations | 25 | |||
| R2 Nagelkerke | 0.997 | |||
| AIC | 312.160 | |||
An \(R^{2}\) (coefficient of determination) is used as a measure of strength or goodness of fit. For Ordinary Least Squares (OLS), it is calculated as:
\[ R^2 = 1 - \frac{\sum_{i=1}^{N} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{N} (y_i - \bar{y})^2} \] where \(y_{i}\) and \(\hat{y}_{i}\) are the \(i^{th}\) observed and predicted value respectively, \(\bar{y}\) is the mean of the observed values and \(N\) is the total number of observations.
This is really only appropriate for OLS. For other models there are alternative measures (pseudo R-squared) that can be applied depending on how they are to be interpreted:
There are many different ways to calculate \(R^2\) values from GLM’s. The following table provides some guidance as to which method is appropriate for which type of model and how they should be interpreted..
| Model | Appropriate \(R^2\) | Formula | Interpreted as |
|---|---|---|---|
| Logistic | Tjur’s R2 | \(R^2=\frac{1}{n_{1}}\sum \hat{\pi}(y=1) - \frac{1}{n_{0}}\sum \hat{\pi}(y=0)\) | |
| Multinomial Logit | McFadden’s R2 | \(R^2=1-\frac{logL(x)}{logL(0)}\) | 1 and 2 |
| GLM | Nagelkerke’s R2 | \(R^2=\frac{1-(\frac{logL(0)}{logL(x)})^{2/N}}{1-logl(0)^{2/N}}\) | 2 |
| Mixed models | Nakagawa’s R2 | Too complex | |
| ZI models | Zero-inflated R2 | Too complex | |
| Bayesian models | Bayes R2 | Too complex |
where \(n_1\) and \(n_0\) are the number of 1’s and 0’s in the response and \(\hat{\pi}\) is the predicted probability. \(logL(x)\) and \(logL(0)\) are the log-likelihoods of the fitted and null models respectively and \(N\) is the number of observations.
# Pseudo-R2 based on the Pearson statistic relative to the null deviance.
# Use the full component name ($null.deviance) rather than relying on
# fragile partial matching ($null).
1 - (peake.ss / peake.glm1$null.deviance)
## [1] 0.8602608
## Or based on deviance (preferred)
1 - (peake.glm1$deviance / peake.glm1$null.deviance)
## [1] 0.8389516
## [1] 0.8389516
\[ R^2 = 1 - exp\left(-\frac{2}{n} (logL(x) - logL(0))\right) \] where \(logL(x)\) and \(logL(0)\) are the log-likelihoods of the fitted and null models respectively. This is then sometimes adjusted (Nagelkerke’s method) such that: \[ max(R^2) = 1 - exp\left(\frac{2}{n} logL(0)\right) \] because sometimes the maximum \(R^2\) is less than one.
## Likelihood-ratio based R2 (with Nagelkerke-style adjustment) from MuMIn
r.squaredLR(peake.glm1)
## [1] 0.855129
## attr(,"adj.r.squared")
## [1] 0.8551296
## Nagelkerke's R2 via the performance package
performance::r2_nagelkerke(peake.glm1)
## Nagelkerke's R2
## 0.9970946
## Using emmeans
## Summary figures: predicted trend with 95% CI, raw observations and
## partial observations, built with emmeans.
## Use `<-` for assignment rather than `=` throughout.
## Build a prediction grid of 100 evenly spaced AREA values
peake.grid <- with(peake, list(AREA=seq(min(AREA), max(AREA), len=100)))
#OR equivalently via modelr
peake.grid <- peake %>% data_grid(AREA=seq_range(AREA, n=100))
## Estimated marginal means on the response (count) scale
newdata <- emmeans(peake.glm1, ~AREA, at=peake.grid, type='response') %>%
    as.data.frame
head(newdata)
## Predicted trend with a 95% confidence ribbon
ggplot(newdata, aes(y=response, x=AREA)) +
    geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL),fill='blue', alpha=0.3) +
    geom_line() +
    theme_classic()
## Same figure with the raw observations overlaid, on log10 axes
ggplot(newdata, aes(y=response, x=AREA)) +
    geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL),fill='blue', alpha=0.3) +
    geom_line() +
    geom_point(data=peake, aes(y=INDIV)) +
    scale_x_log10(breaks = as.vector(c(1,2,5,10) %o% 10^(-1:4))) +
    scale_y_log10() +
    theme_classic()
## If we want to plot the partial observations
partial.obs <- emmeans(peake.glm1, ~AREA, at=peake, type='response') %>%
    as.data.frame %>%
    mutate(response=response+resid(peake.glm1,type='response'))
## response residuals are just resid * mu.eta(predict)
ggplot(newdata, aes(y=response, x=AREA)) +
    geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL),fill='blue', alpha=0.3) +
    geom_line() +
    geom_point(data=peake, aes(y=INDIV)) +
    geom_point(data=partial.obs, aes(y=response), color='green') +
    scale_x_log10(breaks = as.vector(c(1,2,5,10) %o% 10^(-1:4))) +
    scale_y_log10() +
    theme_classic()
Peake, A. J., and G. P. Quinn. 1993. “Temporal Variation in Species-Area Curves for Invertebrates in Clumps of an Intertidal Mussel.” Ecography 16: 269–77.